In [ ]:
!pip install pyspark
Requirement already satisfied: pyspark in c:\users\user\appdata\local\packages\pythonsoftwarefoundation.python.3.11_qbz5n2kfra8p0\localcache\local-packages\python311\site-packages (3.5.1)
Requirement already satisfied: py4j==0.10.9.7 in c:\users\user\appdata\local\packages\pythonsoftwarefoundation.python.3.11_qbz5n2kfra8p0\localcache\local-packages\python311\site-packages (from pyspark) (0.10.9.7)
In [ ]:
from pyspark.sql import *
from pyspark.sql import functions as func
spark = SparkSession.builder.getOrCreate()
In [ ]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("abd") \
    .getOrCreate()


fashion_accessories_df = "hdfs://localhost:9000/abd/All_Category/fashion__accessories.txt"
data_storage_df = "hdfs://localhost:9000/abd/All_Category/data_storage.txt"
perfume_cologne_df = "hdfs://localhost:9000/abd/All_Category/perfume__cologne.txt"
automotive_tools_df = "hdfs://localhost:9000/abd/All_Category/automotive__tools.txt"
beauty_personal_care_df = "hdfs://localhost:9000/abd/All_Category/beauty__personal_care.txt"
bath_body_df = "hdfs://localhost:9000/abd/All_Category/bath__body.txt"
shaving_hair_removal_products_df = "hdfs://localhost:9000/abd/All_Category/shaving__hair_removal_products.txt"
handmade_jewellery_df = "hdfs://localhost:9000/abd/All_Category/handmade_jewellery.txt"
kids_babies_df = "hdfs://localhost:9000/abd/All_Category/kids__babies.txt"
luggage_travel_gear_df = "hdfs://localhost:9000/abd/All_Category/luggage__travel_gear.txt"
home_decor_df = "hdfs://localhost:9000/abd/All_Category/home__decor.txt"
pets_df = "hdfs://localhost:9000/abd/All_Category/pets.txt"
handmade_kitchen_dining_df = "hdfs://localhost:9000/abd/All_Category/handmade_kitchen__dining.txt"
outdoor_cooking_df = "hdfs://localhost:9000/abd/All_Category/outdoor__cooking.txt"
men_df = "hdfs://localhost:9000/abd/All_Category/men.txt"
women_df = "hdfs://localhost:9000/abd/All_Category/women.txt"
grocery_df = "hdfs://localhost:9000/abd/All_Category/grocery.txt"
work_safety_df = "hdfs://localhost:9000/abd/All_Category/work_safety.txt"
hobbies_crafts_df = "hdfs://localhost:9000/abd/All_Category/hobbies__crafts.txt"
toys_games_df = "hdfs://localhost:9000/abd/All_Category/toys__games.txt"

# Reading text files
fashion_accessories_text = spark.sparkContext.textFile(fashion_accessories_df)
data_storage_text = spark.sparkContext.textFile(data_storage_df)
perfume_cologne_text = spark.sparkContext.textFile(perfume_cologne_df)
automotive_tools_text = spark.sparkContext.textFile(automotive_tools_df)
beauty_personal_care_text = spark.sparkContext.textFile(beauty_personal_care_df)
bath_body_text = spark.sparkContext.textFile(bath_body_df)
shaving_hair_removal_products_text = spark.sparkContext.textFile(shaving_hair_removal_products_df)
handmade_jewellery_text = spark.sparkContext.textFile(handmade_jewellery_df)
kids_babies_text = spark.sparkContext.textFile(kids_babies_df)
luggage_travel_gear_text = spark.sparkContext.textFile(luggage_travel_gear_df)
home_decor_text = spark.sparkContext.textFile(home_decor_df)
pets_text = spark.sparkContext.textFile(pets_df)
handmade_kitchen_dining_text = spark.sparkContext.textFile(handmade_kitchen_dining_df)
outdoor_cooking_text = spark.sparkContext.textFile(outdoor_cooking_df)
men_text = spark.sparkContext.textFile(men_df)
women_text = spark.sparkContext.textFile(women_df)
grocery_text = spark.sparkContext.textFile(grocery_df)
work_safety_text = spark.sparkContext.textFile(work_safety_df)
hobbies_crafts_text = spark.sparkContext.textFile(hobbies_crafts_df)
toys_games_text = spark.sparkContext.textFile(toys_games_df)
In [ ]:
# Function to explore data
def explore_data(data, title):
    print(f"Exploring {title}:")
    print("Number of lines:", data.count())  # Count number of lines
    print("Sample data:")
    for line in data.take(5):  # Display sample data
        print(line)
    print("\n")

# Explore each dataset
explore_data(fashion_accessories_text, "Fashion Accessories")
explore_data(data_storage_text, "Data Storage")
explore_data(perfume_cologne_text, "Perfume & Cologne")
explore_data(automotive_tools_text, "Automotive Tools")
explore_data(beauty_personal_care_text, "Beauty & Personal Care")
explore_data(bath_body_text, "Bath & Body")
explore_data(shaving_hair_removal_products_text, "Shaving & Hair Removal Products")
explore_data(handmade_jewellery_text, "Handmade Jewellery")
explore_data(kids_babies_text, "Kids & Babies")
explore_data(luggage_travel_gear_text, "Luggage & Travel Gear")
explore_data(home_decor_text, "Home Decor")
explore_data(pets_text, "Pets")
explore_data(handmade_kitchen_dining_text, "Handmade Kitchen & Dining")
explore_data(outdoor_cooking_text, "Outdoor Cooking")
explore_data(men_text, "Men")
explore_data(women_text, "Women")
explore_data(grocery_text, "Grocery")
explore_data(work_safety_text, "Work & Safety")
explore_data(hobbies_crafts_text, "Hobbies & Crafts")
explore_data(toys_games_text, "Toys & Games")
Exploring Fashion Accessories:
Number of lines: 315017
Sample data:
Fruit of the Loom Boys' Eversoft Cotton Undershirts, T Shirts & Tank Tops
Hanes Boys' Socks, Double Tough Cushioned Crew Socks, 12-pair Packs
The Children's Place Baby Toddler Boys Long Sleeve Oxford Button Down Shirt
Minecraft Boys' 6-Piece Snug-fit Cotton Pajamas Set
Hanes Boys' Socks, Double Tough Cushioned Ankle and No Show, 12-Pair Packs


Exploring Data Storage:
Number of lines: 22054
Sample data:
Samsung 980 PRO SSD 2TB PCIe NVMe Gen 4 Gaming M.2 Internal Solid State Hard Drive Memory Card, Maximum Speed, Thermal Control, MZ-V8P2T0B, Black
WD_BLACK 2TB SN770 NVMe Internal Gaming SSD Solid State Drive - Gen4 PCIe, M.2 2280, Up to 5,150 MB/s - WDS200T3X0E
Samsung 970 EVO Plus 2TB NVMe M.2 Internal SSD (MZ-V7S2T0B/AM) [Canada Version]
Samsung 970 EVO Plus 1TB NVMe M.2 Internal SSD (MZ-V7S1T0/AM) [Canada Version]
Seagate Storage Expansion Card for Xbox Series X|S 2TB Solid State Drive - NVMe Expansion SSD for Xbox Series X|S (STJR2000400)


Exploring Perfume & Cologne:
Number of lines: 18582
Sample data:
Nautica Voyage Eau De Toilette for Men - Fresh, Romantic, Fruity Scent - Woody, Aquatic Notes of Apple, Water Lotus, Cedarwood, and Musk - Ideal for Day Wear - 3.3 Fl Oz
Pure Instinct CRAVE Roll-On The Original Pheromone Infused Essential Oil Perfume Cologne – For Her - TSA Ready 0.34 fl oz
NIVEA Men Sensitive Skin Cooling After Shave Balm (100mL), Aftershave for Sensitive Skin, No Drying Alcohol, Instantly Soothes & Cools Down Skin After Shaving
PB ParfumsBelcam Vault, our version of Armani Code, EDT Spray, 100 ml (Pack of 1)
2 Pcs Pheromones Perfume for Women,Romantic Pheromone Glitter Perfume,Flirty Aroma Lusting Pheromone Perfume,Essential Oil Perfume with Pheromones for Women to Attracting Men.


Exploring Automotive Tools:
Number of lines: 90159
Sample data:
PUREBURG 2-Pack Replacement HEPA Filters Compatible with Therapure TPP240F Fits Envion TPP240 TPP230 Air Purifiers
"Happybuy Stainless Steel Cable 3/16""x 500ft, T304 Marine Grade Deck Cable Railing, 7x19 Strands Construction Braided Aircraft Cable for Deck Rail String Lights Hanging Porch Fence DIY Baluster"
"HOME STAIRWAY LTD. : Stair Iron Railing - 1/2"" Square Metal Balusters in Satin Black - Box of 10 (Single Collar)"
Govee Life Smart Space Heater, Electric Space Heater with Thermostat, Wi-Fi & Bluetooth App Control, Works with Alexa & Google Assistant, 1500W Ceramic Heater for Bedroom, Indoors, Office, Living Room
LEVOIT Air Purifiers for Bedroom Home, HEPA Freshener Filter Small Room Cleaner with Fragrance Sponge for Smoke, Allergies, Pet Dander, Odor, Dust Remover, Office, Desktop, Table Top, Core Mini, White


Exploring Beauty & Personal Care:
Number of lines: 140876
Sample data:
H2ofloss Cordless Water Dental Flosser, Portable Oral Irrigator for Teeth, Braces, Rechargeable & IPX7 Waterproof Teeth Cleaner for Home Travel
Ionic NanoSteamer - 3-in-1 Facial Steamer with Precise Temp Control - Atomizer - Mist - Humidifier- Unclogs Pores - Blackheads - Spa Quality - NanoSteam
beautyblender blendercleanser solid, 1 ounce cleanser
Brightup Beard Trimmer for Men, Hair Clippers & Hair Trimmer for Men, IPX7 Waterproof Mustache Face Nose Ear Body Shavers Electric Razor Men, Mens Gifts, USB Rechargeable & LCD Display, FK-8688T
Crest 3D White Whitestrips Professional Effects Teeth Whitening Kit, 22 Treatments, 13 Levels Whiter


Exploring Bath & Body:
Number of lines: 17181
Sample data:
Handheld Bath Brush with Long Handle Shower Brush-Soft & Comfortable Dry Skin Body Massage Brush Back Exfoliation Brushes (Blue)
Bath Body Brush with Comfy Bristles Non-slip Long Handle Gentle Exfoliation Improve Skin's Health and Beauty, Shower Brush Back Scrubber for Men and Women Relaxing Spa Massage (Pink)
Silicone Body Scrubber 2pcs, 2 in 1 Bath and Shampoo Brush with Handle, Exfoliating Bath Body Scrub Brush for Shower
High quality dry brush body brush, bath brush 5-piece set, natural bristle long handle bath brush, facial brush, exfoliating bath body brush, foot pumice stone, back cotton linen bath rub, cellulite massage brush for lymphatic detoxification
TXV Mart 100% Natural Exfoliating Sisal Bath Gloves Sponge Scrubber Deeply Clean Remove Dead Skin, Bathroom, Shower, Spa - 1 Pair


Exploring Shaving & Hair Removal Products:
Number of lines: 17769
Sample data:
Philips OneBlade Face & Body Kit with Li-Ion Handle, QP2630/21
eos Shea Better Travel Size Shaving Cream, Pomegranate Raspberry, 24HR Hydration, 74ml
OOCOME Women 4 In 1 Rechargeable Electric Epilator Hair Shaver Lady'S Electric Trimmer Remover Waterproof Razor For Bikini Area Nose Armpit Arm Leg
Eyebrow Razors,Multifunctional Stainless Steel Eyebrow Trimmer for Men and Women, Grooming Shavers/Face Hair Removers/Removal/Shaving Tools Set
Charmonic 17.5 Oz Hair Wax Beans , Hard Body Wax Beans, Hair Removal Depilatory Wax European Beads for Women Men 500g/1.1 lb (Chamomile)


Exploring Handmade Jewellery:
Number of lines: 28666
Sample data:
Earrings for Women Spiral threader earrings 14K gold earrings hand bent dangle earrings for women,suitable for gift giving, perfect for your birthday party, Christmas, gift giving.
Tiny Nose Ring Hoop 20 G Nose Piercings Hoop - 14K Gold Filled Nose Piercings hoop
Smilebelle Evil Eye Necklace Gold 14K Protection Necklace with Zircons as Thanksgiving Gifts, Eye Necklace for Women Handmade Jewelry, Luck Amulet for Protection, Third Eye Necklace Birthday Gift for Her
Fake Clip On Nose Ring 24g - 925 Sterling Silver - No Piercing Needed - Fake Nose Hoop
Spiral Threader Earrings 925 Sterling Silver Twisted Linear Curved Pull Through Earrings


Exploring Kids & Babies:
Number of lines: 101988
Sample data:
Girls 9 Pack Tagless Hipster
Kids Watch, Girls Digital Watch with Alarm/Stopwatch/Distance/Calories/Steps Counter, Watches for Kids Teens Gift for Girls Boys
Girls 9 Pack Tagless Brief
Girls 6 Pack - Toddler Assortment
Lucky Clover Necklace For Women Girls, 18K Gold Plated Cute Fashion Simple Girls Titanium Steel Hypoallergenic Pendant


Exploring Luggage & Travel Gear:
Number of lines: 21444
Sample data:
Secure Travel Money Belt, Undercover Hidden RFID Blocking Travel Wallet, Anti-Theft Passport Wallets for Men Women
Cipway - 5 Set Compression Packing Cubes for Travel, Ultralight Packing Organizers for Luggage Suitcase & Backpack (White), L
Luggage Sets 3 Piece Softside Expandable Lightweight & Durable Suitcase Sets Double Spinner Wheels TSA Lock (20in/24in/28in) Blue
Slim Minimalist Aluminum Wallet for Men/Credit Card Holder for Men with Cash Strap
Windproof Travel Umbrella - Wind Resistant, Small - Compact, Light, Automatic, Strong, Mini, Folding and Portable - Backpack, Car, Purse Umbrellas for Rain - Men and Women


Exploring Home Decor:
Number of lines: 67771
Sample data:
Fall Candles, Pumpkin Spice Candles for Home, Autumn Candle, Pumpkin Candle, Fall Scented Candles for Home, Fall Home Decor, Fall Bathroom Decor, Autumn Decor, Hello Pumpkin, Hello Fall - 9oz
Kim and Pom Pumpkin Spice Candle, Fall Scent, Fall Candles
Get Well Soon Gifts for Women Sympathy Gift Baskets Care Package Self Care gifts for Sick Friends Mom Grandma Wife After Surgery Feel Better Gifts Thinking of You Encouragement Stress Relief Present
Birthday Gifts for Women Sunflower Gifts Sunshine Gifts Baskets for Women Gifts for Friends Female Self Care Package Thinking of You Gift Box for Her Sister Boss Lady Inspirational Get Well Soon Gifts
Handmade in Canada - Wooden Custom Baby Name Signs for Nursery for Boys & Girls - Choose Size, Font, Color - Baby Room Wall Decor - Newborn Essentials Gift - Wood Letters - Personalized Nursery Decor


Exploring Pets:
Number of lines: 18724
Sample data:
"peepeego Upgrade Non-Slip Dog Pads Extra Large 72"" x 72"", Washable Puppy Pads with Fast Absorbent, Reusable, Waterproof for Training, Travel, Whelping, Housebreaking, Incontinence, for Playpen, Crate"
Sure Petcare - SureFeed Microchip Pet Feeder - The Automatic Pet Feeder That Makes Meal Times Stress Free - Helps prevent food stealing - Great for Prescription and Weight Management Diets
Pet N Pet Dog Poop Bags 1080 Counts, Green Dog Bags Poop Bag, USDA Certified 38% Biobased Doggy Poop Bags Dog Bag, Durable Dog Waste Bags Dog Poop Bag, Dog Poo Bags, Pet Poop Bags Dogs Poops Bag
Purina Pro Plan Veterinary Supplements Dog Supplement, FortiFlora Powdered Canine Probiotic - 30 x 1 g Sachets (1 Pack), Brown
Arm & Hammer Clump & Seal Slide Clay Cat Litter, 12.7kg, Odour Control, Dust Free, Clumping Litter


Exploring Handmade Kitchen & Dining:
Number of lines: 18674
Sample data:
Thank You Gifts for Women Spa Thoughtful Unique Gift Basket for Coworkers Nurse Friends Men Boss Employee Secretary Hostess Teacher Mom Her
Personalized Whiskey Glasses - Custom Whiskey Gifts for Men - Old Fashion Rocks Scotch Glass - Birthday, Anniversary, Dad, Boyfriend, Husband, Mens Gifts - Christmas Gift for Men - Mens Xmas Gift
Off Cut & Co. Premium Board Balm Wax - Premium Canadian Beeswax and Mineral Oil Cutting Board Balm Wax- (3.5 oz/ 100g)
Handmade in Canada - Personalized Cutting Board - Unique Wedding Gift Idea for Couples, Anniversary, Bridal Shower, Housewarming - Christmas Gift for Couples - Custom Charcuterie & Cheese Board
Personalized Cutting Boards - Wedding Gifts, House Warming Gifts, Anniversary Gifts for Her & Him, Couples Engagement Gifts - Cheese & Charcuterie Board - Personalized Gifts for Men, Women & Couples


Exploring Outdoor Cooking:
Number of lines: 16976
Sample data:
ThermoPro Waterproof Digital Instant Read Meat Thermometer Kitchen Cooking Food Thermometer with Backlight Steak Oil Fry Candy Thermometer
"Grillman Heavy-Duty BBQ Cover, Gas Grill Cover for Weber Spirit, Weber Genesis, Char Broil, Nexgrill. Rip-Proof, Waterproof (58"" L x 24"" W x 48"" H, Black) BBQ Covers"
SimpleHouseware 55-inch Waterproof Heavy Duty Gas BBQ Grill Cover, Weather-Resistant Polyester
Traeger Grills Signature Blend 100% All-Natural Wood Pellets for Smokers and Pellet Grills. BBQ, Bake, Roast, and Grill, 20 lb. Bag
Grill Cover, BBQ Cover 58 inch,Waterproof BBQ Grill Cover,UV Resistant Gas Grill Cover,Durable and Convenient,Rip Resistant,Black Barbecue Grill Covers,Fits Grills of Weber,Brinkmann,Char-Broil etc (58 Inch)


Exploring Men:
Number of lines: 23235
Sample data:
Polarized Aviator Sunglasses for Men Women Metal Flat Top Sunglasses lightweight Driving UV400 Outdoor 58mm
Semi-Rimless Polarized Sunglasses UV Protection Classic Half Frame Sun Glasses Men Women
Mens Hooded Sweatshirt
Men's Regular-Fit Long-Sleeve Solid Shirt, Black, Large
Mens Classic Relaxed Fit Stretch Cargo Short


Exploring Women:
Number of lines: 25349
Sample data:
Winter Gloves Womens 100% Genuine Leather Touchscreen Warm Driving Gloves
Premium Orthopedic Open Toe Sandlas Anti-Slip Ladies Wedge Faux Leather Sandals Summer Hook and Loop Comfy Sandals Casual Beach Sandals,Brown,US6/EU37
womens Marl Slub Slouch Boot Sock, 4 Pair Pack
Catholic Pink Crystal Beads Gold Rosary Flowers Beaded Necklace Holy Mary Heart Locket Medal & Cross Religious Amulet for Women, Crystal, No Gemstone
8 Pairs Clip on Earrings for Women Dangling Cross Butterfly Star Pearl Clip Dangle Earrings Set Hypoallergenic Clip Long Earrings Non Pierced Piercing Jewelry Silver Gold Tone


Exploring Grocery:
Number of lines: 22912
Sample data:
1LB. 100% Hawaii Hawaiian Kona Extra Fancy Coffee Beans
HERSHEY'S Unsweetened Cocoa Powder for Baking, Chocolate Powder, Gluten Free, 652g - Online Exclusive
1LB. 100% Jamaican Blue Mountain Roasted Coffee
Maynards, Assorted Gummy Candy (Pack of 90), Sour Patch Kids, Fuzzy Peach, Swedish Berries, Swedish Fish, Bulk Candy, Individually Wrapped, Sour Candy, Halloween Candy, 1.12 kg
MARS Variety, Halloween Chocolate Candy Bars, Assorted Fun Size Bars, Bulk Box, 120 Count


Exploring Work & Safety:
Number of lines: 16684
Sample data:
Mens Ripstop Men's Multi-Cargo Scrub Pant
Tuffo unisex baby overalls and coveralls workwear apparel, Blue, 4T Pack of 1 US
White Lab Coats Doctor Workwear - Unisex Lab Coat Scrubs for Woman and Man
Venom Steel Heavy Duty Breathable Coverall L/XL, White
Adjustable Working Cap with Button, Cotton Working Hat Sweatband, Elastic Bandage Tie Back Hats for Women & Men, One Size


Exploring Hobbies & Crafts:
Number of lines: 16802
Sample data:
Keadic 9Pcs Gundam Model Tools Kit Hobby Building Tools Craft Set for Basic Model Building, Repairing and Fixing
LIFEGOO Gundam Model Tools Kit, 42 in 1 Modeler Basic Tools Craft Set Hobby Building Tools Kit with Tool Case Perfect for Gundam Model Building Repairing and Fixing
Magnifier with light and stand with 18 LEDs, 10 frames 30 frames foldable reading magnifier - 3 lighting modes, dimmable large glass hand crank magnifier Desktop magnifier for the elderly, children, reading, inspection, hobby - with lens cloth
Bandai Hobby - Mobile Suit Gundam - HG 1/144 Gundam Barbatos Lupus Model Kit
XINMEIWEN Pieces Shelves Tool Stand Holder Model Production Tools Placement Rack Plastic Holder Container Organizer for Gundam Hobby Model Making Parts (29x18x12cm)


Exploring Toys & Games:
Number of lines: 18990
Sample data:
iHaHa Fire Truck Toys for 1 2 3 4 5 6 Years Old Boys Toddler, 5 in 1 Kids Carrier Toy Birthday, Car Friction Power Toys with Light Sound
Magnetic Tiles Kids Toys STEM Magnet Toys for Toddler Magnetic Blocks Building Toys Preschool Learning Sensory Montessori Toys for 3+ Year Old Boys and Girls, Safe Creativity Toddler Kids Toys
yiyisibao Magnet Toys for 3 Year Old Boys & Girls, Magnetic Blocks STEM Learning Educational Building Blocks for Kids Ages 4-8, Toddler Toys 40 PCS Montessori Toys for 2 Year Old Boys
Drum Set for Kids with 2 Drum Sticks and Microphone, Musical Toys Gift for Toddlers…
Kiddiworld Dinosaur Toys for 3 4 5 Year Old Boys Gifts, Dinosaurs Toys for Kids 3-5-7, Dino Figures Activity Play Mat Christmas Birthday Gifts for Girls Toddler Toys Age 2-4


In [ ]:
# Calculate total number of lines
total_lines = (
    fashion_accessories_text.count() +
    data_storage_text.count() +
    perfume_cologne_text.count() +
    automotive_tools_text.count() +
    beauty_personal_care_text.count() +
    bath_body_text.count() +
    shaving_hair_removal_products_text.count() +
    handmade_jewellery_text.count() +
    kids_babies_text.count() +
    luggage_travel_gear_text.count() +
    home_decor_text.count() +
    pets_text.count() +
    handmade_kitchen_dining_text.count() +
    outdoor_cooking_text.count() +
    men_text.count() +
    women_text.count() +
    grocery_text.count() +
    work_safety_text.count() +
    hobbies_crafts_text.count() +
    toys_games_text.count()
)

print("Total number of lines across all datasets:", total_lines)
Total number of lines across all datasets: 1019853
In [ ]:
# Define ignored words
ignored_words = ['men', 'women', "women's", "men's", 'mens', 'womens', 'for', 'and', 'or', 'x', 'X', 'the', 'in', 'of', 'with', 'on', 'at', 'by', 'to', 'from', 'as', 'is', 'are', 'it', 'be', 'that', 'which', 'this', 'where', 'when', 'how', 'so', 'also', 'will', 'has', 'have', 'but', 'not', 'no', 'may', 'yet', 'you', 'we', 'i', 'a', 'an', '(', ')', '[', ']', '{', '}', ',', '.', ';', ':', '-', '_', '/', '\\', '!', '?', '"', "'", '*', '&', '%', '$', '@', '+', '=', '<', '>', '|']

# Function to perform word count while filtering out ignored words
def word_count(line):
    words = line.lower().split()
    # Remove ignored words
    filtered_words = [word for word in words if word not in ignored_words]
    return [(word, 1) for word in filtered_words]
In [ ]:
# Word count for each text file
word_counts = {
    'fashion_accessories': fashion_accessories_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'data_storage': data_storage_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'perfume_cologne': perfume_cologne_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'automotive_tools': automotive_tools_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'beauty_personal_care': beauty_personal_care_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'bath_body': bath_body_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'shaving_hair_removal_products': shaving_hair_removal_products_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'handmade_jewellery': handmade_jewellery_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'kids_babies': kids_babies_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'luggage_travel_gear': luggage_travel_gear_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'home_decor': home_decor_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'pets': pets_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'handmade_kitchen_dining': handmade_kitchen_dining_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'outdoor_cooking': outdoor_cooking_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'men': men_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'women': women_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'grocery': grocery_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'work_safety': work_safety_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'hobbies_crafts': hobbies_crafts_text.flatMap(word_count).reduceByKey(lambda x, y: x + y),
    'toys_games': toys_games_text.flatMap(word_count).reduceByKey(lambda x, y: x + y)
}
In [ ]:
# Display top 100 words for each dataset
for category, word_count_rdd in word_counts.items():
    print(f"\nTop 100 words for {category}:")
    top_100_words = word_count_rdd.takeOrdered(100, key=lambda x: -x[1])
    for word, count in top_100_words:
        print(f"{word}: {count}")
Top 100 words for fashion_accessories:
watch: 58279
earrings: 33891
girls: 31089
leather: 30158
bag: 27880
jewelry: 27154
necklace: 26727
silver: 24525
black: 21789
gold: 20943
bracelet: 20411
strap: 19887
steel: 19671
gift: 17853
dress: 17513
stainless: 17426
casual: 17176
shoes: 17018
gifts: 16307
fashion: 16274
sleeve: 16148
band: 15821
set: 15744
shoulder: 15198
long: 15115
sterling: 13260
chain: 13072
pendant: 12905
purse: 12855
size: 12840
hat: 12591
quartz: 12588
wedding: 12498
crossbody: 11988
ring: 11735
party: 11120
adjustable: 10910
boots: 10166
waterproof: 10068
sunglasses: 9348
vintage: 9277
watches: 9269
short: 9150
2: 8958
toe: 8758
cotton: 8410
heart: 8264
crystal: 8260
high: 8185
white: 8135
pocket: 8117
shirt: 8060
handbag: 8005
summer: 7971
genuine: 7954
classic: 7895
small: 7886
birthday: 7886
boot: 7769
top: 7714
tote: 7657
women,: 7490
cute: 7470
lightweight: 7376
ladies: 7345
belt: 7309
soft: 7266
neck: 7202
winter: 7173
dangle: 7152
925: 7118
rings: 7109
bracelets: 7102
fit: 7043
watch,: 7031
wallet: 7021
one: 7010
cap: 6989
plated: 6881
hoop: 6851
bags: 6684
stud: 6662
pants: 6654
buckle: 6623
kids: 6619
socks: 6436
wrist: 6407
round: 6358
3: 6335
pack: 6304
shoe: 6285
sports: 6186
wide: 6133
large: 6096
beach: 5932
unisex: 5925
big: 5654
accessories: 5590
slip: 5576
blue: 5565

Top 100 words for data_storage:
drive: 19342
usb: 15520
flash: 8621
hard: 6645
memory: 5909
stick: 5409
ssd: 5064
external: 4231
internal: 3328
thumb: 3322
storage: 3320
sata: 3215
3.0: 3048
portable: 3006
solid: 2817
state: 2806
m.2: 2636
drive,: 2556
2.0: 2531
up: 2241
nvme: 2240
hdd: 2193
drives: 2143
card: 1989
disk: 1961
data: 1930
compatible: 1916
speed: 1825
pen: 1729
64gb: 1726
pcie: 1722
pack: 1719
1tb: 1694
pc: 1689
32gb: 1662
black: 1644
gb: 1617
high: 1508
128gb: 1502
desktop: 1433
1: 1419
laptop: 1369
2: 1334
c: 1326
pro: 1309
2tb: 1278
10: 1267
inch: 1229
ultra: 1123
2.5: 1082
seagate: 1000
jump: 982
16gb: 971
gen: 966
iii: 962
metal: 950
3d: 946
adapter: 902
3: 900
3.2: 897
sd: 894
256gb: 892
4: 891
–: 859
swivel: 845
2280: 844
photo: 826
nand: 823
sandisk: 822
computer: 811
bulk: 809
2.5"": 808
digital: 801
pc,: 786
3.1: 765
cache: 759
6gb/s: 745
type: 743
wd: 740
read: 739
backup: 735
tb: 722
dual: 708
512gb: 703
5: 700
class: 700
blue: 694
android: 689
mac: 681
500gb: 675
samsung: 668
pendrive: 664
4tb: 658
nas: 657
design: 644
performance: 640
ssd,: 623
mb/s: 603
xbox: 603
mini: 602

Top 100 words for perfume_cologne:
perfume: 10522
de: 6145
spray: 5823
eau: 5742
oil: 4847
essential: 3671
fragrance: 3626
lasting: 3288
parfum: 3156
oz: 3133
long: 3046
bottle: 2540
perfume,: 2480
toilette: 2088
solid: 2085
spray,: 1967
pheromone: 1913
3.4: 1869
ml: 1841
cologne: 1740
travel: 1637
portable: 1440
bag: 1371
women,: 1278
storage: 1277
body: 1241
edt: 1058
attar: 1052
gift: 1042
men,: 996
ounce: 949
light: 913
scent: 895
case: 853
set: 835
edp: 826
100: 811
balm: 800
ounces: 800
natural: 792
100ml: 777
floral: 776
oils: 684
pack: 665
organizer: 657
50ml: 653
1.7: 643
unisex: 642
pcs: 624
refreshing: 622
mini: 597
free: 581
1: 565
daily: 561
attract: 544
al: 542
bottles: 538
mist: 529
3: 519
6ml: 517
box: 512
pure: 497
alcohol: 475
perfumes: 460
fruity: 460
aalam: 456
pocket: 449
flower: 438
musk: 431
holder: 427
glass: 426
2: 424
fl: 414
aromatherapy: 400
4: 397
black: 397
fresh: 397
fragrances: 392
3.3: 391
2pcs: 387
rose: 384
dating: 383
elegant: 379
30ml: 368
carrying: 361
suitable: 361
empty: 359
premium: 358
fomiyes: 357
pheromones: 355
nimal: 349
2.5: 346
100%: 339
uses: 335
shave: 333
fragrance,: 333
luxury: 330
refillable: 326
white: 325
oil,: 319

Top 100 words for automotive_tools:
door: 12232
steel: 11203
air: 8216
bit: 7982
drill: 7775
inch: 7585
replacement: 7529
pack: 6778
2: 6593
set: 6458
black: 6315
bearing: 6123
ball: 5664
filter: 5319
kit: 5006
heavy: 4744
tool: 4710
led: 4695
stainless: 4687
duty: 4640
rubber: 4639
4: 4632
compatible: 4496
shank: 4486
1: 4319
router: 4239
wall: 4067
car: 3896
bearings: 3895
wood: 3830
pcs: 3794
double: 3691
metal: 3668
light: 3642
3: 3447
white: 3429
bits: 3408
belt,: 3401
carbide: 3400
tape: 3325
adhesive: 3284
hole: 3268
high: 3258
cabinet: 3225
lock: 3217
powerdrive: 3180
round: 2909
furniture: 2815
uxcell: 2768
groove: 2751
10: 2706
cover: 2683
cutting: 2653
6: 2621
floor: 2531
belt: 2450
kitchen: 2404
v: 2359
home: 2357
waterproof: 2336
deep: 2303
aluminum: 2276
plastic: 2258
hooks: 2227
screw: 2220
cutter: 2218
bathroom: 2200
carbon: 2179
length: 2164
5: 2134
2pcs: 2133
diameter: 2101
cnc: 2094
thread: 2089
self: 2085
wire: 2082
hand: 2081
end: 2074
seal: 2065
front: 2040
holder: 2027
garage: 2024
adjustable: 1997
repair: 1981
chrome: 1958
saw: 1954
tile: 1947
roller: 1929
bore: 1924
window: 1900
set,: 1887
1/2"": 1883
vent: 1869
machine: 1866
switch: 1845
linear: 1839
towel: 1823
speed: 1795
tools: 1780
diy: 1778

Top 100 words for beauty_personal_care:
hair: 52892
face: 18706
skin: 16922
makeup: 16242
ml: 12965
body: 12732
lip: 12525
cream: 11881
eye: 11060
oil: 10959
natural: 10297
de: 9790
oz: 9427
eau: 8749
tattoo: 8733
spray: 8672
black: 8373
brush: 8009
set: 7811
long: 7289
pack: 6537
2: 6507
perfume: 6445
gel: 6398
1: 6134
girls: 5987
waterproof: 5913
facial: 5854
dry: 5781
care: 5716
professional: 5596
eyebrow: 5573
lash: 5547
eyelash: 5537
nail: 5524
pcs: 5505
kit: 5451
powder: 5134
shampoo: 4990
liquid: 4901
color: 4814
lasting: 4778
serum: 4738
clips: 4737
free: 4710
3: 4692
mask: 4604
all: 4530
accessories: 4409
parfum: 4404
dark: 4243
100ml: 4127
matte: 4125
beauty: 4090
gift: 4086
soft: 4066
moisturizing: 4057
party: 4020
wig: 3949
temporary: 3948
vegan: 3942
4: 3931
lotion: 3903
travel: 3867
lashes: 3859
halloween: 3850
fragrance: 3811
tattoos: 3801
styling: 3739
comb: 3706
fl: 3699
vitamin: 3573
lipstick: 3514
6: 3469
eyeshadow: 3438
hydrating: 3431
balm: 3409
–: 3385
toilette: 3377
100: 3295
glitter: 3288
hair,: 3286
(pack: 3278
remover: 3267
cream,: 3256
50ml: 3213
tool: 3199
eyeliner: 3193
foundation: 3187
skin,: 3166
up: 3162
mascara: 3161
conditioner: 3059
fake: 3053
stick: 3042
50: 3042
kids: 3032
organic: 2988
extension: 2947
extensions: 2947

Top 100 words for bath_body:
bath: 8119
shower: 4369
body: 4310
soap: 3695
baby: 3679
exfoliating: 1989
hair: 1947
wash: 1728
skin: 1672
natural: 1655
brush: 1583
scrubber: 1544
toys: 1523
towel: 1493
sponge: 1464
oz: 1421
cap: 1375
soft: 1371
pack: 1360
deodorant: 1290
kids: 1194
toy: 1192
back: 1185
hand: 1169
bathtub: 1132
2: 1049
wipes: 1032
spa: 987
bar: 985
loofah: 860
towels: 847
3: 840
pcs: 835
face: 830
long: 827
sweat: 759
rubber: 745
set: 726
4: 724
soap,: 720
(pack: 711
gift: 688
bonnet: 687
duck: 676
pillow: 668
oil: 659
mesh: 642
6: 633
dry: 629
cotton: 618
bathroom: 587
bathing: 578
silicone: 573
ounce: 565
water: 559
1: 553
suction: 547
wipe: 545
shampoo: 539
tub: 539
organic: 538
satin: 534
caps: 516
scrub: 513
home: 510
wet: 510
girls: 509
black: 507
100%: 507
ml: 506
hat: 499
bag: 499
large: 496
cleaning: 495
gloves: 492
gel: 489
massage: 485
silk: 483
travel: 482
sleep: 474
free: 461
pure: 457
care: 457
absorbent: 457
net: 451
white: 450
reusable: 447
toddler: 445
all: 435
hooded: 432
waterproof: 426
gifts: 413
kitchen: 411
infant: 403
neck: 403
underarm: 401
boys: 401
cleansing: 399
pads: 394
handle: 387

Top 100 words for shaving_hair_removal_products:
hair: 10783
beard: 5766
razor: 5582
shaver: 5244
electric: 4867
removal: 4099
wax: 4066
shaving: 3679
trimmer: 3342
tweezers: 3018
nose: 2181
eyebrow: 2131
replacement: 2036
brush: 2007
portable: 1942
facial: 1884
tool: 1800
head: 1789
steel: 1687
stainless: 1641
remover: 1513
travel: 1507
kit: 1506
face: 1495
body: 1474
rechargeable: 1396
mini: 1364
set: 1289
1: 1276
blades: 1230
shave: 1225
professional: 1089
blade: 1081
grooming: 1066
men,: 1027
usb: 1002
series: 985
painless: 972
home: 957
waxing: 948
razors: 943
safety: 941
epilator: 931
eyelash: 925
case: 909
double: 901
bowl: 895
3: 894
cream: 888
2: 825
shavers: 822
tweezer: 807
philips: 805
edge: 771
foil: 765
skin: 763
waterproof: 756
wet: 755
soap: 725
pcs: 714
depilatory: 707
shaver,: 704
tools: 703
trimmer,: 689
use: 684
beauty: 678
comb: 671
mustache: 670
norelco: 658
braun: 657
dry: 652
precision: 645
holder: 641
black: 633
handle: 631
5: 623
crystal: 622
bikini: 620
compatible: 619
storage: 606
stand: 585
machine: 585
pocket: 578
heads: 569
fomiyes: 565
cleaning: 564
care: 563
strips: 561
power: 559
4: 558
cordless: 558
warmer: 547
pack: 545
scissors: 531
easy: 526
oil: 517
razor,: 508
women,: 507
legs: 504
barber: 502

Top 100 words for handmade_jewellery:
ring,: 16262
silver: 13328
sterling: 9417
gift: 9093
jewelry: 8818
ring: 8783
earrings: 8082
handmade: 7405
gold: 6533
necklace: 6208
925: 6156
gemstone: 4949
pendant: 4038
bracelet: 3593
rings: 3420
birthstone: 3119
blue: 2905
stone: 2862
natural: 2789
necklace,: 2756
plated: 2610
statement: 2420
pendant,: 2340
jewelry,: 2310
gifts: 2251
black: 2222
earrings,: 2153
her: 2117
solid: 2014
wedding: 1920
crystal: 1898
charm: 1809
box: 1806
14k: 1748
fashion: 1702
bracelet,: 1627
green: 1598
jewellery: 1588
rose: 1568
turquoise: 1554
band: 1547
dainty: 1486
day: 1447
white: 1405
onyx: 1402
•: 1373
pin: 1366
birthday: 1351
women,: 1350
unique: 1343
stud: 1333
boho: 1318
custom: 1271
oval: 1271
girls: 1254
anemone: 1229
quartz: 1209
set: 1198
designer: 1179
dangle: 1178
chain: 1144
healing: 1141
veracity: 1140
heart: 1135
pink: 1131
round: 1122
drop: 1077
amethyst: 1064
engagement: 1019
18k: 1011
diamond: 989
christmas: 987
gift,: 984
moonstone: 982
size: 978
red: 945
adjustable: 934
anniversary: 912
glass: 912
promise: 907
shape: 905
vintage: 881
yellow: 866
flower: 865
wish: 848
hoop: 838
silver,: 836
personalized: 780
nose: 762
mothers: 759
rings,: 758
bar: 755
pearl: 751
genuine: 750
name: 743
bead: 741
cute: 737
style: 727
her,: 723
beads: 721

Top 100 words for kids_babies:
baby: 43808
girls: 34185
kids: 27612
boys: 25838
toddler: 17796
toys: 15452
set: 10789
sleeve: 9901
toy: 9867
girl: 9511
long: 8903
infant: 7708
cotton: 7647
gift: 7295
shoes: 6895
winter: 6706
years: 6594
boy: 6592
2: 6358
newborn: 6318
dress: 6295
3: 6120
clothes: 6116
gifts: 6030
soft: 5963
birthday: 5733
months: 5684
socks: 5361
pants: 5331
pack: 4983
toddlers: 4583
unisex: 4519
warm: 4430
hat: 4354
bath: 4338
6: 4321
necklace: 4284
cute: 4274
children: 4051
party: 4019
1: 3956
short: 3948
sensory: 3942
shirt: 3894
old: 3882
year: 3748
outfits: 3658
fleece: 3617
t-shirt: 3599
summer: 3595
outfit: 3593
romper: 3541
learning: 3508
jacket: 3487
top: 3482
christmas: 3444
little: 3383
wooden: 3336
4: 3325
hooded: 3267
up: 3201
black: 3197
tops: 3191
jewelry: 3188
montessori: 3169
school: 3117
educational: 3062
shorts: 2949
size: 2916
casual: 2775
12: 2765
sweatshirt: 2725
toys,: 2703
5: 2700
bodysuit: 2690
hoodie: 2655
babies: 2639
play: 2617
princess: 2588
cartoon: 2573
print: 2551
white: 2547
halloween: 2519
piece: 2479
coat: 2478
adjustable: 2477
silicone: 2386
suit: 2367
waterproof: 2339
activity: 2329
one: 2329
bracelet: 2315
age: 2307
rubber: 2279
boots: 2257
color: 2224
car: 2199
silver: 2187
solid: 2145
animal: 2116

Top 100 words for luggage_travel_gear:
bag: 13110
travel: 6056
backpack: 4185
wallet: 3529
leather: 3401
bags: 3248
shoulder: 3244
luggage: 3035
purse: 3021
tote: 2995
crossbody: 2588
keychain: 2565
gifts: 2313
card: 2238
gift: 2237
large: 1800
holder: 1785
key: 1721
small: 1631
laptop: 1583
handbag: 1544
bag,: 1537
pack: 1505
rfid: 1366
black: 1348
waterproof: 1335
girls: 1323
suitcase: 1201
inch: 1163
school: 1100
canvas: 1090
shopping: 1087
strap: 1083
power: 1048
mini: 1045
lightweight: 1041
pocket: 1000
case: 969
cute: 968
organizer: 921
set: 920
2: 918
kids: 908
zipper: 901
women,: 899
blocking: 874
credit: 868
adjustable: 859
clutch: 836
carry: 828
gym: 820
adapter: 818
slim: 813
birthday: 806
hiking: 798
duffel: 797
pouch: 793
chain: 774
handbags: 760
coin: 742
one: 738
umbrella: 720
packing: 714
sling: 704
reusable: 691
daypack: 671
storage: 669
casual: 668
pu: 655
beach: 653
business: 652
messenger: 646
size: 644
purses: 644
dc: 620
id: 617
car: 603
black,: 598
grocery: 589
genuine: 586
converter: 581
belt: 579
sports: 575
pcs: 569
3: 565
tags: 564
christmas: 563
bifold: 556
work: 553
portable: 552
keyring: 550
fashion: 545
clear: 542
cover: 537
phone: 537
wallets: 535
duffle: 534
usb: 533
ring: 530
supply: 520

Top 100 words for home_decor:
wall: 19317
print: 18490
art: 18272
light: 16444
led: 15467
bulb: 10271
decor: 10095
gift: 9929
white: 9264
home: 8497
poster: 8131
bulbs: 6980
pack: 6335
handmade: 5697
sign: 5514
warm: 5206
bulb,: 4571
unframed: 4468
painting: 4426
christmas: 4278
bulbs,: 4018
name: 3938
dimmable: 3753
skyline: 3742
house: 3618
personalized: 3583
custom: 3518
halogen: 3368
candle: 3348
inspired: 3258
personalised: 3160
black: 3138
base: 3122
canvas: 3113
poster,: 2990
2: 2973
lamp: 2964
modern: 2921
metal: 2847
clock: 2753
room: 2741
watt: 2691
vintage: 2649
edison: 2640
dog: 2593
equivalent: 2555
gifts: 2553
art,: 2499
print,: 2355
decoration: 2350
glass: 2322
(unframed): 2312
energy: 2283
original: 2219
watercolor: 2206
wood: 2202
address: 2176
gift,: 2144
plaque: 2136
replacement: 2125
acrylic: 2081
screw: 2052
decor,: 2046
3: 2043
wax: 2020
e26: 2016
6: 2008
box: 1994
4: 1986
lighting: 1980
wooden: 1919
clear: 1917
frame: 1908
wedding: 1840
set: 1831
number: 1827
tree: 1772
2700k: 1766
nursery: 1741
white,: 1730
base,: 1728
fine: 1708
saving: 1707
quote: 1702
door: 1686
design: 1676
family: 1670
filament: 1666
scented: 1655
10: 1631
small: 1629
e27: 1626
baby: 1605
framed: 1579
daylight: 1577
fluorescent: 1570
equivalent,: 1558
cool: 1553
inch: 1501
soy: 1452

Top 100 words for pets:
dog: 13648
cat: 8139
pet: 5736
dogs: 4029
small: 2789
food: 2415
large: 2373
cats: 2192
toys: 1883
puppy: 1706
natural: 1669
medium: 1664
toy: 1615
water: 1604
pack: 1527
collar: 1306
dogs,: 1302
fish: 1287
training: 1239
2: 1236
chew: 1124
dry: 937
chicken: 934
indoor: 932
adjustable: 926
litter: 914
pets: 914
grooming: 913
aquarium: 910
bed: 896
soft: 892
interactive: 885
treats: 883
3: 836
1: 804
kitten: 797
cats,: 770
bird: 752
mat: 732
bag: 714
–: 702
free: 690
4: 670
waterproof: 669
food,: 669
treat: 656
hair: 651
ball: 637
bowl: 633
adult: 628
seat: 620
tank: 611
brush: 609
all: 606
kg: 597
fountain: 595
feeder: 583
car: 575
long: 542
easy: 538
cleaning: 538
durable: 536
extra: 532
flea: 522
toys,: 518
light: 516
wet: 516
up: 501
filter: 498
100%: 493
scratching: 492
g: 488
washable: 479
premium: 476
black: 469
safety: 467
6: 464
plush: 457
(pack: 456
steel: 455
stainless: 453
dental: 451
toy,: 445
bags: 445
skin: 443
spray: 440
outdoor: 433
box: 430
animal: 426
leash: 416
chews: 411
feeding: 408
set: 407
remover: 399
pad: 397
paw: 395
high: 391
complete: 389
pcs: 388
cover: 386

Top 100 words for handmade_kitchen_dining:
gift: 11987
mug: 11274
coffee: 10455
15oz: 8296
11oz: 8159
black: 5195
lovers: 4789
tea: 4702
mug,: 4367
cup: 4127
dog: 3781
color: 3639
gifts: 2954
white: 2692
cup,: 2322
funny: 2321
birthday: 2178
pet: 2040
cat: 1923
oz: 1765
idea: 1702
cake: 1501
blackmug,: 1392
changing: 1370
design: 1328
ceramic: 1317
day: 1317
inner: 1294
accent: 1286
topper,: 1234
kids: 1056
personalized: 1027
12oz: 957
love: 946
custom: 853
my: 822
tasse: 820
cute: 810
quote: 806
ideas: 769
illustration: 767
lover: 765
vintage: 744
unique: 740
11: 738
mom: 727
dad: 726
15: 726
i'm: 699
christmas: 677
fitness: 665
gift,: 657
perfect: 651
scottish: 651
day,: 651
clan: 645
crest: 632
silver: 622
gym: 616
wine: 603
fans: 601
enamel: 584
animal: 567
elegance: 563
atelier: 562
teacher: 555
present: 526
badge: 526
tumbler: 524
school: 512
steel: 510
humor: 500
halloween: 496
anniversary: 495
cutting: 479
old: 473
engraved: 459
mugs: 458
customized: 454
board,: 450
name,: 449
travel: 434
bulldog: 427
topper: 426
proud: 421
wooden: 409
best: 396
name: 389
mugs,: 383
who: 380
14oz: 376
steinless: 375
pride: 372
happy: 372
lovers,: 369
unicorn: 368
gifts,: 365
firefighter: 362
dad,: 360
owners: 357

Top 100 words for outdoor_cooking:
grill: 9378
bbq: 7186
steel: 3646
gas: 3502
stainless: 3455
outdoor: 3234
barbecue: 3133
cover: 2272
camping: 2126
replacement: 1907
charcoal: 1853
cooking: 1851
weber: 1801
picnic: 1728
propane: 1715
portable: 1494
heat: 1462
grilling: 1403
set: 1392
smoker: 1390
cooler: 1360
oven: 1347
bag: 1323
burner: 1302
accessories: 1265
waterproof: 1248
2: 1219
wood: 1189
kit: 1103
inch: 1081
griddle: 1032
black: 1004
heavy: 1001
grill,: 999
duty: 993
skewers: 979
grills: 976
fire: 960
pizza: 959
basket: 948
brush: 909
large: 908
pack: 900
meat: 818
handle: 814
kitchen: 804
parts: 795
cover,: 736
4: 734
hose: 727
resistant: 721
3: 717
mat: 677
cleaning: 639
tool: 622
series: 619
pit: 617
insulated: 604
iron: 594
storage: 582
tank: 560
1: 557
rack: 555
genesis: 546
cast: 543
regulator: 543
plate: 523
premium: 519
smoking: 513
silicone: 512
garden: 508
pellet: 507
metal: 494
long: 489
food: 488
travel: 480
electric: 474
spirit: 469
beach: 467
patio: 464
adapter: 462
natural: 456
adjustable: 452
apron: 449
chicken: 448
fits: 448
reusable: 447
chef: 444
grate: 440
ii: 440
top: 435
blackstone: 434
quick: 432
blanket: 430
box: 423
tools: 421
grills,: 420
flat: 417
baking: 415
thermometer: 412

Top 100 words for men:
cotton: 2388
casual: 2348
sleeve: 2321
shirt: 2280
socks: 2140
long: 1844
fit: 1779
leather: 1707
shorts: 1595
black: 1401
pants: 1377
short: 1356
hat: 1352
shirts: 1349
jacket: 1293
t-shirt: 1209
watch: 1177
unisex: 1172
cap: 1148
classic: 1147
belt: 1136
pack: 1120
winter: 1106
fleece: 1087
sports: 1015
lightweight: 1007
slim: 987
work: 975
size: 957
soft: 940
shoes: 937
breathable: 866
top: 861
pocket: 858
set: 855
outdoor: 830
warm: 824
underwear: 824
running: 821
pockets: 810
sunglasses: 805
steel: 777
hoodie: 771
adjustable: 755
button: 749
3: 748
boxer: 739
2: 739
neck: 734
zip: 708
dress: 683
waterproof: 682
stretch: 681
summer: 669
gift: 654
trousers: 634
polo: 633
hooded: 632
quick: 621
tie: 612
pairs: 597
strap: 595
vest: 590
sweatshirt: 583
dry: 574
wallet: 565
stainless: 563
up: 557
athletic: 554
elastic: 553
boots: 550
pullover: 548
solid: 547
vintage: 546
card: 545
gym: 538
business: 519
hiking: 510
coat: 510
beach: 508
tops: 502
glasses: 500
men,: 494
briefs: 492
crew: 491
wedding: 490
uk: 488
down: 488
fashion: 483
cargo: 479
safety: 477
protection: 468
6: 458
golf: 453
100%: 450
sport: 448
thermal: 443
wide: 442
t: 439
big: 435

Top 100 words for women:
earrings: 4409
long: 3429
ladies: 3367
sleeve: 3089
dress: 2604
silver: 2340
casual: 2335
high: 2220
socks: 2180
tops: 2154
girls: 2153
set: 2023
necklace: 1991
neck: 1968
gold: 1952
bra: 1854
size: 1812
waist: 1656
top: 1642
jewelry: 1641
soft: 1551
winter: 1509
lace: 1508
sterling: 1446
cotton: 1439
party: 1438
summer: 1367
bracelet: 1324
pants: 1318
short: 1302
gifts: 1228
gift: 1180
pairs: 1178
warm: 1169
up: 1161
hoop: 1131
wedding: 1104
sexy: 1096
black: 1094
vintage: 1021
belt: 1019
pack: 1013
adjustable: 1005
plus: 1004
fashion: 1000
2: 977
v: 955
chain: 948
women,: 948
pockets: 938
shirt: 936
control: 920
heart: 907
hat: 895
shorts: 886
hair: 877
piece: 872
fleece: 861
lightweight: 857
dresses: 856
925: 854
leather: 846
costume: 846
shirts: 835
stud: 833
elastic: 830
tummy: 822
leggings: 817
loose: 817
skirt: 814
wide: 808
shoes: 807
stretch: 798
sunglasses: 798
halloween: 797
3: 793
plated: 772
yoga: 758
uk: 747
one: 747
strap: 743
underwear: 743
tights: 741
sleeveless: 736
beach: 716
fit: 699
jacket: 686
seamless: 678
knit: 671
birthday: 667
bracelets: 666
headband: 663
pendant: 634
floral: 632
dangle: 631
slip: 631
waisted: 631
crystal: 619
glasses: 619
open: 618

Top 100 words for grocery:
organic: 6324
tea: 4882
coffee: 2001
natural: 1975
pack: 1945
(pack: 1740
free: 1578
chocolate: 1516
1: 1358
powder: 1201
100%: 1196
gluten: 1195
sugar: 1165
bags: 1109
premium: 1060
black: 1023
count: 1015
g: 1011
oz: 1008
whole: 976
vegan: 925
green: 920
–: 897
tea,: 873
free,: 869
12: 805
vegan,: 798
leaf: 779
bag: 761
2: 760
non-gmo,: 757
grams: 752
protein: 745
dried: 726
certified: 726
pure: 720
white: 694
herbal: 683
fruit: 672
food: 658
1kg: 657
candy: 652
ml: 649
cake: 646
original: 644
6: 643
hot: 632
low: 616
dark: 607
coffee,: 594
mix: 592
24: 588
high: 586
made: 586
milk: 571
3: 567
all: 566
4: 555
gift: 537
ounce: 535
salt: 533
box: 533
ground: 523
loose: 516
20: 508
bulk: 505
quality: 496
red: 490
non-gmo: 488
medium: 485
keto: 484
raw: 478
100: 477
1): 472
coconut: 470
yupik: 466
gourmet: 461
oil: 459
sweet: 453
rice: 452
organic,: 444
vanilla: 439
variety: 437
instant: 437
roast: 435
powder,: 434
kosher,: 433
10: 429
gram: 418
blend: 412
5: 408
8: 406
snack: 405
16: 404
kg: 402
butter: 397
drink: 393
perfect: 390
lemon: 385
roast,: 385

Top 100 words for work_safety:
sleeve: 3327
hats: 3053
working: 2870
pants: 2655
scrub: 2474
work: 2382
cap: 2332
casual: 2298
adjustable: 2230
jacket: 2206
chef: 2171
short: 2153
long: 2075
2: 1828
sweatband: 1774
shirt: 1672
tops: 1636
size: 1583
top: 1511
hat: 1495
uniform: 1473
unisex: 1457
fit: 1414
back: 1334
coat: 1333
tie: 1300
bouffant: 1281
buttons: 1280
shirts: 1258
summer: 1251
neck: 1175
pockets: 1166
loose: 1149
elastic: 1069
set: 1063
cargo: 1025
pant: 1017
cotton: 1007
v-neck: 991
button: 975
caps: 972
scrubs: 948
pocket: 939
high: 919
workwear: 839
print: 817
warm: 815
heated: 774
pack: 767
trousers: 764
v: 763
winter: 754
graduation: 750
tactical: 744
solid: 734
vest: 698
military: 689
dress: 676
shorts: 650
printed: 649
plus: 647
outdoor: 645
nurse: 641
heating: 634
waist: 634
lightweight: 625
pieces: 610
head: 605
usb: 604
fashion: 602
soft: 593
kitchen: 589
black: 586
t-shirt: 586
safety: 585
2023: 571
stretch: 571
t: 566
leg: 562
color: 557
blouse: 549
front: 543
hooded: 538
zipper: 527
overalls: 522
one: 517
medical: 499
scarf: 499
cute: 487
relaxed: 481
slim: 479
floral: 469
nursing: 468
(color: 466
pattern: 453
jogger: 445
breathable: 432
outerwear: 428
drawstring: 427
charging: 427

Top 100 words for hobbies_crafts:
model: 6762
figure: 3650
funko: 2291
gift: 2210
kit: 2148
scale: 2086
pop!: 2085
vinyl: 2046
kids: 2015
toys: 1961
collectable: 1925
official: 1888
idea: 1838
merchandise: 1821
adults: 1701
fans: 1612
car: 1280
card: 1214
collectors: 1171
set: 1162
rc: 1091
plastic: 1076
display: 1069
scissors: 1038
cutting: 884
metal: 817
hobby: 745
gundam: 729
diecast: 689
tamiya: 688
toy: 677
mini: 654
bandai: 639
collection: 626
cutter: 608
movies: 586
craft: 578
star: 547
building: 540
paint: 516
miniature: 513
sewing: 498
train: 485
diy: 480
trading: 478
kit,: 476
black: 449
action: 444
anime: 439
airplane: 438
scenery: 433
2: 433
tool: 432
plane: 430
vallejo: 428
pack: 422
tools: 417
cards: 409
tv: 403
mat: 388
acrylic: 385
figures: 383
alloy: 379
aircraft: 378
revell: 359
steel: 358
railway: 355
box: 354
glass: 346
3: 332
models: 331
1: 329
rotary: 325
–: 324
sleeves: 316
ho: 314
red: 308
fabric: 307
4: 306
green: 304
airfix: 304
color: 303
crawler: 296
game: 295
decoration: 294
wars: 292
ship: 287
military: 281
track: 279
grass: 279
games: 275
scissors,: 274
compatible: 272
1/144: 272
binder: 265
white: 261
knife: 261
army: 260
pcs: 258
stainless: 257

Top 100 words for toys_games:
toys: 12101
kids: 11581
toy: 7130
girls: 6088
boys: 5928
ages: 5243
3: 4057
old: 3914
set: 3673
4: 3561
gifts: 3554
gift: 3518
birthday: 3222
6: 3215
year: 3166
5: 2867
baby: 2805
2: 2795
age: 2705
play: 2700
toddler: 2603
learning: 2594
educational: 2534
building: 2452
toddlers: 2428
up: 2301
kit: 2167
game: 2159
8: 2143
7: 2104
years: 2083
kids,: 1867
party: 1750
car: 1747
toys,: 1691
wooden: 1679
stem: 1596
1: 1573
montessori: 1305
sensory: 1290
games: 1283
12: 1282
9: 1236
10: 1190
blocks: 1165
puzzle: 1145
adults: 1127
girl: 1120
toy,: 1095
board: 1091
remote: 1087
dinosaur: 1085
pretend: 1084
animal: 1083
plush: 1080
preschool: 1059
–: 1056
pcs: 1036
rc: 1013
magnetic: 1004
puzzles: 999
mini: 993
pack: 989
control: 980
set,: 974
christmas: 973
girls,: 971
water: 968
fun: 937
boy: 933
bath: 919
outdoor: 898
diy: 893
light: 855
fidget: 824
soft: 813
children: 791
stuffed: 768
construction: 753
3+: 751
ball: 746
kitchen: 736
months: 724
science: 710
electric: 708
led: 673
doll: 664
truck: 656
cars: 655
crafts: 653
11: 644
pool: 638
toddlers,: 634
activity: 620
playset: 613
travel: 599
model: 583
robot: 583
action: 571
kits: 568
In [ ]:
# Define the output directory in HDFS
output_directory = "hdfs://localhost:9000/abd/wordcount_results/"

# Save word count results for each dataset to HDFS
for category, word_count_rdd in word_counts.items():
    # Construct the output path for the current category
    output_path = output_directory + category
    # Save the word count results to HDFS
    word_count_rdd.map(lambda x: f"{x[0]}: {x[1]}").saveAsTextFile(output_path)
In [ ]:
# Define the directory where word count results are saved
wordcount_directory = "hdfs://localhost:9000/abd/wordcount_results/"

# Read a sample of the word count results for each dataset from HDFS
for category in word_counts.keys():
    # Construct the path to the directory containing word count results for the current category
    category_directory = wordcount_directory + category
    # Read the first few lines from one of the files in the directory
    sample = spark.read.text(category_directory).head(5)
    # Print the sample for the current category
    print(f"\nSample of word count results for {category}:")
    for row in sample:
        print(row['value'])
Sample of word count results for fashion_accessories:
eversoft: 28
cotton: 8410
shirts: 4713
double: 2966
crew: 2506

Sample of word count results for data_storage:
pro: 1309
ssd: 5064
pcie: 1722
nvme: 2240
4: 891

Sample of word count results for perfume_cologne:
nautica: 36
eau: 5742
romantic,: 5
notes: 113
apple,: 11

Sample of word count results for automotive_tools:
replacement: 7529
filters: 1502
tpp240f: 1
fits: 1476
envion: 5

Sample of word count results for beauty_personal_care:
cordless: 593
water: 2407
flosser,: 11
rechargeable: 728
ipx7: 90

Sample of word count results for bath_body:
brush: 1583
brush-soft: 2
dry: 629
body: 4310
exfoliation: 79

Sample of word count results for shaving_hair_removal_products:
philips: 805
face: 1495
body: 1474
li-ion: 9
handle,: 92

Sample of word count results for handmade_jewellery:
earrings: 8082
threader: 29
hand: 265
bent: 2
women,suitable: 1

Sample of word count results for kids_babies:
girls: 34185
tagless: 40
kids: 27612
watch,: 187
digital: 511

Sample of word count results for luggage_travel_gear:
belt,: 27
hidden: 66
rfid: 1366
5: 238
set: 920

Sample of word count results for home_decor:
fall: 253
spice: 47
scented: 1655
home: 8497
decor,: 2046

Sample of word count results for pets:
"peepeego: 1
upgrade: 27
non-slip: 226
dog: 13648
pads: 378

Sample of word count results for handmade_kitchen_dining:
thank: 29
gifts: 2954
spa: 1
thoughtful: 12
unique: 740

Sample of word count results for outdoor_cooking:
thermopro: 11
digital: 185
instant: 174
read: 14
thermometer: 412

Sample of word count results for men:
sunglasses: 805
metal: 383
top: 861
lightweight: 1007
driving: 317

Sample of word count results for women:
winter: 1509
100%: 195
genuine: 85
touchscreen: 33
driving: 67

Sample of word count results for grocery:
1lb.: 5
extra: 342
coffee: 2001
beans: 273
hershey's: 27

Sample of word count results for work_safety:
ripstop: 136
coveralls: 129
blue,: 232
4t: 2
us: 120

Sample of word count results for hobbies_crafts:
keadic: 16
9pcs: 9
tools: 417
hobby: 745
building: 540

Sample of word count results for toys_games:
ihaha: 9
truck: 656
toys: 12101
3: 4057
4: 3561
In [ ]:
import matplotlib.pyplot as plt

# Function to plot bar chart for top words
def plot_bar_chart(category, top_words):
    plt.figure(figsize=(10, 6))
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]
    plt.barh(words, counts, color='skyblue')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.title(f'Top 10 words for {category}')
    plt.gca().invert_yaxis()  # Invert y-axis to display highest count at the top
    plt.show()

# Plot bar charts for top 10 words in each category
for category, word_count_rdd in word_counts.items():
    top_10_words = word_count_rdd.takeOrdered(10, key=lambda x: -x[1])
    plot_bar_chart(category, top_10_words)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Function to generate word cloud for top words
def generate_word_cloud(category, top_words):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(dict(top_words))
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for {category}')
    plt.show()

# Generate word cloud for top 100 words in each category
for category, word_count_rdd in word_counts.items():
    top_100_words = word_count_rdd.takeOrdered(100, key=lambda x: -x[1])
    generate_word_cloud(category, top_100_words)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image